Spark 2.3.1 + Kafka: consuming messages with the Direct approach

Maven dependencies
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
    <version>2.3.1</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <version>2.3.1</version>
</dependency>
2.3.1 is the Spark version; in the artifact name, 0-8 refers to the Kafka API version of the integration and 2.11 to the Scala binary version.
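For reference, the equivalent sbt coordinates (a sketch only; the %% operator appends the project's Scala binary version, 2.11 here, to the artifact names):

    libraryDependencies ++= Seq(
      "org.apache.spark" %% "spark-streaming"           % "2.3.1",
      "org.apache.spark" %% "spark-streaming-kafka-0-8" % "2.3.1"
    )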
Direct approach code
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

object Test {
  // Not used by the Direct approach below
  val zkQuorum = "mirrors.mucang.cn:2181"
  val groupId = "nginx-cg"
  val topic = Map("nginx-log" -> 1)

  // Batch interval in seconds
  val KAFKA_INTERVAL = 10

  case class NginxInfo(domain: String, ip: String)

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("NginxLogAnalyze").setMaster("local[*]")
    val sparkContext = new SparkContext(sparkConf)
    val streamContext = new StreamingContext(sparkContext, Seconds(KAFKA_INTERVAL))

    val kafkaParam = Map[String, String](
      "bootstrap.servers" -> "xx.xx.cn:9092",
      "group.id" -> "nginx-cg",
      // The old consumer API used by the 0-8 integration accepts "smallest"/"largest"
      "auto.offset.reset" -> "largest"
    )
    val topic = Set("nginx-log")

    // NOTE: called without explicit type parameters, this compiles but fails at
    // runtime with the NoSuchMethodException shown below; see the solution section.
    val kafkaStream = KafkaUtils.createDirectStream(streamContext, kafkaParam, topic)

    // Split each record's string representation and count the derived keys per batch
    val counter = kafkaStream
      .map(_.toString().split(" "))
      .map(item => (item(0).split(",")(1) + "-" + item(2), 1))
      .reduceByKey((x, y) => (x + y))

    counter.foreachRDD(rdd => {
      rdd.foreach(println)
    })

    streamContext.start()
    streamContext.awaitTermination()
  }
}
"largest" is used here because the Kafka version is too old to support "latest".
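For comparison, a minimal sketch of the same setting with the newer spark-streaming-kafka-0-10 integration, whose new consumer API expects "earliest"/"latest" instead; the broker address and group id below simply mirror the placeholders used above:

    import org.apache.kafka.common.serialization.StringDeserializer

    // Sketch only: parameter map for the 0-10 integration (new consumer API)
    val kafkaParams010 = Map[String, Object](
      "bootstrap.servers" -> "xx.xx.cn:9092",              // placeholder broker from the code above
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "nginx-cg",
      "auto.offset.reset" -> "latest"
    )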
Exception information
Caused by: java.lang.NoSuchMethodException: scala.runtime.Nothing$.<init>(kafka.utils.VerifiableProperties)
at java.lang.Class.getConstructor0(Class.java:3082)
at java.lang.Class.getConstructor(Class.java:1825)
at org.apache.spark.streaming.kafka.KafkaRDD$KafkaRDDIterator.<init>(KafkaRDD.scala:153)
at org.apache.spark.streaming.kafka.KafkaRDD.compute(KafkaRDD.scala:136)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
... 3 more
Solution

When Kafka validates its properties it cannot use Scala's default (inferred) type; the Kafka decoder classes must be specified explicitly, i.e. createDirectStream[String, String, StringDecoder, StringDecoder], where StringDecoder must be kafka.serializer.StringDecoder.
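A minimal sketch of the corrected call, reusing the streamContext, kafkaParam, and topic values from the code above:

    import kafka.serializer.StringDecoder
    import org.apache.spark.streaming.kafka.KafkaUtils

    // Explicit type parameters: key type, value type, key decoder, value decoder.
    // Leaving them out lets Scala infer Nothing, which causes the
    // NoSuchMethodException shown above.
    val kafkaStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      streamContext, kafkaParam, topic)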